# -*- coding: utf-8 -*-

import os, jieba
from gensim import corpora, models, similarities

files_dir = '../static/files/texts/'

def cut(stopwords, txt):
	"""Segment ``txt`` with jieba and drop every token found in ``stopwords``.

	Args:
		stopwords: Container of tokens to discard; it is only used for
			``in`` membership tests.
		txt: Text to segment.

	Returns:
		list: The tokens that are not stop words, in the order jieba
		yielded them.
	"""
	return [token for token in jieba.cut(txt) if token not in stopwords]

def save():
	"""Build the dictionary, bag-of-words corpus and TF-IDF model, and save them.

	Reads the stop-word list from ``../static/stopwords.txt`` and every
	regular file in ``files_dir``, tokenizes each file with ``cut`` and
	writes three artifacts under ``../static/files/tmp/``: ``plan.dict``,
	``plan.corpus`` and ``plan.model``.

	Files are processed in sorted file-name order, so row ``i`` of the stored
	corpus corresponds to the ``i``-th regular file of the sorted listing.
	"""
	# A set gives O(1) membership tests inside cut().
	with open('../static/stopwords.txt') as f:
		stopwords = {line.strip() for line in f}

	# os.listdir() returns names in arbitrary order: sort them so the corpus
	# rows are reproducible, and skip anything that is not a regular file
	# (open() would fail on a sub-directory).
	texts = []
	for name in sorted(os.listdir(files_dir)):
		path = os.path.join(files_dir, name)
		if not os.path.isfile(path):
			continue
		with open(path) as f:
			texts.append(f.read())

	split_texts = [cut(stopwords, text) for text in texts]

	dictionary = corpora.Dictionary(split_texts)
	dictionary.save('../static/files/tmp/plan.dict')

	# Persist the bag-of-words corpus to disk as a sparse coordinate
	# (Matrix Market) matrix.
	corpus_vector = [dictionary.doc2bow(text) for text in split_texts]
	corpora.MmCorpus.serialize('../static/files/tmp/plan.corpus', corpus_vector)

	tfidf_model = models.TfidfModel(corpus_vector)
	tfidf_model.save('../static/files/tmp/plan.model')

def search(stopwords, target):
	"""Print and return the similarity of ``target`` to every stored document.

	Loads the dictionary, TF-IDF model and bag-of-words corpus from
	``../static/files/tmp/`` (``plan.dict``, ``plan.model``, ``plan.corpus``),
	converts ``target`` to a TF-IDF vector and queries a
	``MatrixSimilarity`` index built over the TF-IDF-transformed corpus.

	Args:
		stopwords: Container of tokens to discard, passed through to ``cut``.
		target: Query text.

	Returns:
		The result of querying the index with the TF-IDF vector of
		``target`` (one similarity score per corpus document, in corpus
		order). The same value is printed to stdout.
	"""
	dictionary = corpora.Dictionary.load('../static/files/tmp/plan.dict')
	tfidf_model = models.TfidfModel.load('../static/files/tmp/plan.model')
	corpus_vector = corpora.MmCorpus('../static/files/tmp/plan.corpus')

	target_vector = dictionary.doc2bow(cut(stopwords, target))
	tfidf_target = tfidf_model[target_vector]

	tfidf_corpus = tfidf_model[corpus_vector]
	# Pass the vocabulary size explicitly; without num_features gensim has to
	# scan the whole corpus an extra time just to infer the index width.
	index = similarities.MatrixSimilarity(tfidf_corpus, num_features=len(dictionary))
	sim = index[tfidf_target]
	print(sim)
	return sim

if __name__ == '__main__':
	# save()  # uncomment to (re)build the artifacts that search() loads

	# Load the stop words as a set so the membership tests in cut() are O(1).
	with open('../static/stopwords.txt') as f:
		stopwords = {line.strip() for line in f}

	# Sample queries; each one is scored against the stored corpus.
	queries = (
		'面向非均质储层勘探开发，以空隙结构特征研究为基础，开展流体响应机理研究，主要研究内容有数字岩心孔隙结构分析方法、孔隙结构地震反射特征定量表征方法、频变流体因子反演方法、孔隙结构约束的流体因子构建方法和阻抗域流体因子构建及应用。主要承担单位有中国石油集团科学技术研究院有限功公司和中国石油集团东方地球物理勘探有限责任公司，期望在鄂尔多斯盆地、四川盆地应用。',
		'海外重点区域勘探技术研究，南苏丹、苏丹重点盆地勘探领域评价与目标优选，主要研究内容有中西非裂谷系不同凹陷沉降曲线、构造、沉积响应、成藏组合评价、成藏模式，中西非原油、岩屑样品地球化学分析，Melut盆地北部凹陷重点地区岩性地层圈闭成藏条件，Melut盆地北部凹陷和Muglad盆地S1/2/4区低阻油层识别评价，Melut盆地低勘探程度区和Muglad盆地6区Kaikang槽“三新”领域成藏条件及目标优选，Melut盆地南部火成岩发育区地震资料重新处理，主要承担单位有中国石油集团科学技术研究院有限公司、中国石油集团西北地质研究所有限公司、中国石油集团东方地球物理勘探有限责任公司和中估计石油集团测井有限公司。',
		'面向大庆油田萨北开发区的勘探技术研究，面对萨北开发区特高含水期油田开发，研究内容包括三次采油、水驱调整，过渡带交替注聚，分析二类油层储层的沉积特征，期望增加可采储量。主要承担单位有中国中国石油集团科学技术研究院有限公司和中国石油大庆油田有限责任公司。',
	)
	for query in queries:
		search(stopwords, query)

	# with open('../static/stopwords.txt') as f:
	# 	stopwords = [line.strip() for line in f.readlines()]

	# texts = []
	# for file in os.listdir(files_dir):
	# 	with open(files_dir + file) as f: texts.append(f.read())

	# split_texts = [cut(stopwords, text) for text in texts]

	# dictionary = corpora.Dictionary(split_texts)
	# # print(dictionary)
	# # dictionary.save('../static/files/tmp/dicts')

	# # d1 = dictionary.load('../static/files/tmp/dicts')
	# # print(d1)
	# # print(dictionary == d1, dictionary.token2id == d1.token2id)

	# corpus_vector = [dictionary.doc2bow(text) for text in split_texts]
	# # print(corpus_vector)

	# tfidf_model = models.TfidfModel(corpus_vector)

	# # for cv in corpus_vector:
	# # 	print(tfidf_model[cv])

	# # for i in range(len(corpus_vector)):
	# # 	print(tfidf_model[corpus_vector[i]])

	# # print(tfidf_model[corpus_vector])
	# # vector = tfidf_model[corpus_vector[0]] #将模型应用于第一个语料库文档
	# # print(vector)

	# tran_corpus = tfidf_model[corpus_vector]
	# print(tran_corpus)
	# print(type(tran_corpus))
	# print(tran_corpus.save)
	# # for x in tran_corpus:
	# # 	print(x)

	# index = similarities.MatrixSimilarity(tran_corpus)
	# print(index)

	# fns = os.listdir(files_dir)

	# # for fni in fns:
	# texts = []
	# with open(files_dir + fns[0]) as f:
	# 	line = f.readline()
	# 	while line:
	# 		texts.append(line)
	# 		line = f.readline()

	# # for x in texts:
	# # 	# print(x.replace('\n', ''))
	# # 	x = x.strip()
	# # 	if x:
	# # 		print(cut(stopwords, x))
	# 		# print(jieba.lcut(x))
	# 	# print(cut(stopwords, x.replace('\n', '')))

	# # split_texts = [cut(stopwords, x.strip()) for x in texts]
	# split_texts = [r for r in [cut(stopwords, x.strip()) for x in texts] if r]

	# dictionary = corpora.Dictionary(split_texts)
	# print(dictionary)
	# # print(dictionary.token2id)

	# corpus = [dictionary.doc2bow(text) for text in split_texts]
	# # print(type(corpus))
	# # print(len(corpus))
	# # print(type(corpus[0]))
	# # print(type(corpus[0][0]))
	# # print(type(corpus[0][0][1]))
	# # print(corpus)

	# tfidf = models.TfidfModel(corpus)
	# # print(tfidf)
	# # print(tfidf.wv)

	# tfidf_corpus = tfidf[corpus]
	# print(1, tfidf_corpus)

	# index = similarities.MatrixSimilarity(tfidf_corpus)
	# print(index)

	# target = '面向非均质储层勘探开发，以空隙结构特征研究为基础，开展流体响应机理研究，主要研究内容有数字岩心孔隙结构分析方法、孔隙结构地震反射特征定量表征方法、频变流体因子反演方法、孔隙结构约束的流体因子构建方法和阻抗域流体因子构建及应用。主要承担单位有中国石油集团科学技术研究院有限功公司和中国石油集团东方地球物理勘探有限责任公司，期望在鄂尔多斯盆地、四川盆地应用。'
	# target_vector = dictionary.doc2bow(cut(stopwords, target))
	# print(target_vector)
	# tfidf_target = tfidf[target_vector]

	# sim = index[tfidf_target]
	# # print('************************************************')
	# # print(sim)
